In [1]:
# Required Packages
import pandas as pd
import numpy as np

# Modeling
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
# from sklearn.metrics import classification_report, accuracy_score, f1_score, precision_score, recall_score

# preprocessing
from sklearn.impute import SimpleImputer

# Visualisation libraries

## Text
from colorama import Fore, Back, Style
from IPython.display import Image, display, Markdown, Latex

## progress bar
import progressbar

## seaborn
import seaborn as sns
sns.set_context('paper', rc={'font.size':12,'axes.titlesize':14,'axes.labelsize':12})
sns.set_style('white')

## matplotlib
import matplotlib.pyplot as plt
from matplotlib.patches import Ellipse, Polygon
import matplotlib.gridspec as gridspec
import matplotlib.colors
from pylab import rcParams
plt.style.use('seaborn-whitegrid')
plt.rcParams['figure.figsize'] = 14, 8
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12
plt.rcParams['text.color'] = 'k'
%matplotlib inline

## plotly
from plotly.offline import init_notebook_mode, iplot 
import plotly.graph_objs as go
import plotly.offline as py
from plotly.subplots import make_subplots
import plotly.express as px
%config InlineBackend.figure_format = 'retina' 

## missingno
import missingno as msno

import warnings
warnings.filterwarnings("ignore")

Starbucks Offer Personalizations

In this article, we investigate a simulated dataset that mimics customer behavior on the Starbucks rewards mobile app. Starbucks tends to send out offers to users of the mobile app once every few days. These offers are exclusive; that is, not all users receive the same offer. An offer can contain a discount for their products or sometimes a BOGO (buy one, get one free) deal. These offers have a validity period before the offer expires. The article here is inspired by a towardsdatascience.com article.

Loading the Datasets

In [2]:
def Line(N):
    """Return a horizontal rule made of N '=' characters."""
    return '=' * N

def Header(Inp, Length = 120):
    """Print *Inp* as a cyan-on-black section title, followed by a blue
    '=' rule padding the line out to *Length* columns."""
    rule = Line(Length - len(Inp) - 1)
    title = Back.BLACK + Fore.CYAN + Style.NORMAL + '%s' % Inp + Style.RESET_ALL
    padding = Fore.BLUE + Style.NORMAL + ' %s' % rule + Style.RESET_ALL
    print(title + padding)

def Bottom(Length = 120):
    """Print a full-width blue '=' closing rule (Length characters)."""
    print(Fore.BLUE + Style.NORMAL + '%s' % Line(Length) + Style.RESET_ALL)
    
def _load_and_show(title, path):
    """Print a section header, load the CSV at *path*, and display its head."""
    Header(title)
    frame = pd.read_csv(path)
    display(frame.head().style.hide_index())
    return frame

# The three cleaned Starbucks datasets, each shown with a styled preview.
Portfolio = _load_and_show('Portfolio Dataset:', 'StarBucks/Portfolio_Clean.csv')
Profile = _load_and_show('Profile Dataset:', 'StarBucks/Profile_Clean.csv')
Transcript = _load_and_show('Transcript Dataset:', 'StarBucks/Transcript_Clean.csv')
Bottom()

# Pre-aggregated per-user features and the combined dataset (loaded silently).
User_Data = pd.read_csv('StarBucks/User_Data.csv')
Data = pd.read_csv('StarBucks/Data.csv')
Portfolio Dataset: =====================================================================================================
Reward Difficulty Duration Offer_Type Offer_ID Email Mobile Social Web
10 10 7 BOGO ae264e3637204a6fb9bb56bc8210ddfd 1 1 1 0
10 10 5 BOGO 4d5c57ea9a6940dd891ad53e9dbe8da0 1 1 1 1
0 0 4 Informational 3f207df678b143eea3cee63160fa8bed 1 1 0 1
5 5 7 BOGO 9b98b8c7a33c4b65b9aebfe6a799e6d9 1 1 0 1
5 20 10 Discount 0b1e1539f2cc45b7b9fa7c272da2e1d7 1 0 0 1
Profile Dataset: =======================================================================================================
Gender Age ID Became_Member_On Income Member_Since_Year Member_Tenure
Other 55.000000 68be06ca386d4c31939f3a4f0e3dd783 2017-02-12 64000.000000 2017 23.000000
Female 55.000000 0610b486422d4921ae7d2bf64640c50b 2017-07-15 112000.000000 2017 18.000000
Other 55.000000 38fe809add3b4fcf9315a9694bb96ff5 2018-07-12 64000.000000 2018 6.000000
Female 75.000000 78afa995795e4d85b5d9ceeca43f5fef 2017-05-09 100000.000000 2017 20.000000
Other 55.000000 a03223e636434f42ac4c3df47e8bac43 2017-08-04 64000.000000 2017 17.000000
Transcript Dataset: ====================================================================================================
Person Event Value Time Amount Reward Offer_ID
78afa995795e4d85b5d9ceeca43f5fef Offer Received {'offer id': '9b98b8c7a33c4b65b9aebfe6a799e6d9'} 0 nan nan 9b98b8c7a33c4b65b9aebfe6a799e6d9
a03223e636434f42ac4c3df47e8bac43 Offer Received {'offer id': '0b1e1539f2cc45b7b9fa7c272da2e1d7'} 0 nan nan 0b1e1539f2cc45b7b9fa7c272da2e1d7
e2127556f4f64592b11af22de27a7932 Offer Received {'offer id': '2906b810c7d4411798c6938adc9daaa5'} 0 nan nan 2906b810c7d4411798c6938adc9daaa5
8ec6ce2a7e7949b1bf142def7d0e0586 Offer Received {'offer id': 'fafdcd668e3743c1bb461111dcafc2a4'} 0 nan nan fafdcd668e3743c1bb461111dcafc2a4
68617ca6246f4fbc85e91a2a49552598 Offer Received {'offer id': '4d5c57ea9a6940dd891ad53e9dbe8da0'} 0 nan nan 4d5c57ea9a6940dd891ad53e9dbe8da0
========================================================================================================================

Modeling

The objective of this exercise is to determine the best offer type for a given user. This can be done via a classification method that also provides a probability. Here we use the scikit-learn MultiOutputClassifier with RandomForestClassifier for our modeling.

In [3]:
# Remove columns that are excluded from the model's feature/target set.
# NOTE(review): the rationale for excluding these is not shown in this
# notebook — confirm against the feature-engineering step that built User_Data.
User_Data = User_Data.drop(
    columns=['No_Offer', 'BOGO_comp', 'Info_comp', 'Disc_comp',
             'Tot_Rewards_Rec', 'Offer_Difficulty'])

Correlation

In [4]:
def Correlation_Plot(Df, Fig_Size):
    """Draw a lower-triangle correlation heatmap of Df's numeric columns.

    Parameters
    ----------
    Df : pd.DataFrame
        Data whose pairwise correlations (rounded to 2 decimals) are plotted.
    Fig_Size : int or float
        Width and height of the square figure in inches.
    """
    Correlation_Matrix = Df.corr().round(2)
    # Boolean mask hiding the strictly upper triangle. k=1 keeps the diagonal
    # visible — equivalent to the original code, which masked the full upper
    # triangle and then reset the diagonal entries in a Python loop.
    mask = np.triu(np.ones_like(Correlation_Matrix, dtype=bool), k=1)
    fig, ax = plt.subplots(figsize=(Fig_Size, Fig_Size))
    # NOTE(review): vmin=0 clips the color scale below zero; negative
    # correlations are still annotated but share the lowest color band.
    sns.heatmap(Correlation_Matrix, ax=ax, mask=mask, annot=True, square=True,
                cmap=sns.color_palette("RdYlGn", n_colors=10), linewidths=0.2,
                vmin=0, vmax=1, cbar_kws={"shrink": .5})
    # Removed dead code: the original read ax.get_ylim() into unused locals
    # (a leftover matplotlib 3.1.1 heatmap-clipping workaround).

Correlation_Plot(User_Data, 14)
In [5]:
# Targets are the three offer-type indicator columns; everything else is a feature.
Target = {'BOGO_offer':'BOGO Offers', 'Disc_offer': 'Discount Offers','Info_offer':'Informational Offers'}
target_columns = list(Target)
X = User_Data.drop(columns=target_columns)
y = User_Data[target_columns]

# 70/30 hold-out split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Summarize the shapes of the four resulting sets (last expression displays).
pd.DataFrame(data={'Set': ['X_train', 'X_test', 'y_train', 'y_test'],
                   'Shape': [X_train.shape, X_test.shape,
                             y_train.shape, y_test.shape]}).set_index('Set').T
Out[5]:
Set X_train X_test y_train y_test
Shape (11900, 22) (5100, 22) (11900, 3) (5100, 3)
In [6]:
# Multi-label classifier: one independent 100-tree random forest per offer
# type. random_state pins the forests' bootstrap sampling so results are
# reproducible across notebook re-runs (the original model was unseeded);
# 42 matches the seed used for the train/test split.
clf = MultiOutputClassifier(
    RandomForestClassifier(n_estimators=100, oob_score=True, random_state=42))
_ = clf.fit(X_train, y_train)

# Hold-out predictions, reused by the evaluation cells below.
y_pred = clf.predict(X_test)
In [7]:
# Per-label precision/recall/F1 on the hold-out set. Reuses y_pred computed in
# the previous cell instead of re-running clf.predict(X_test).
# target_names=Target iterates the dict's keys, so the report columns are
# labelled 'BOGO_offer' / 'Disc_offer' / 'Info_offer'.
display(pd.DataFrame(metrics.classification_report(y_test, y_pred,
                                                   output_dict=True,
                                                   target_names=Target)).round(4))
BOGO_offer Disc_offer Info_offer micro avg macro avg weighted avg samples avg
precision 0.9219 0.9539 0.9720 0.9462 0.9492 0.9467 0.7285
recall 0.9778 0.9769 0.9668 0.9749 0.9738 0.9749 0.7459
f1-score 0.9490 0.9653 0.9694 0.9603 0.9612 0.9604 0.7328
support 2293.0000 2603.0000 1506.0000 6402.0000 6402.0000 6402.0000 6402.0000

Important Features

In [8]:
# Human-readable labels for the engineered feature names (used on plot axes).
Feat_Dict = {'BOGO_Offer_Rec':'BOGO Offer Received', 'Difficulty_per_Offer':'Difficulty per Offer',
             'Disc_Offer_Rec':'Discount Offer Received', 'Gender_Female':'Gender: Female',
             'Gender_Male':'Gender: Male', 'Gender_Other':'Gender: Other', 'Info_Offer_Rec':'Informational Offer Received',
             'Member_Tenure':'Member Tenure', 'Offer_Comp_Rec_Ratio': 'Offer Completed Receive Ratio',
             'Offer_Comp_View_Ratio':'Viewed Offer Completed Ratio', 'Offer_Tran_Cnt_Ratio':'Offer Transaction Count Ratio' ,
             'Offer_Trans_Amnt':'Offer Transaction Amount', 'Offer_Trans_Amnt_Ratio':'Offer Transaction Amount Ratio',
             'Offer_View': 'Viewed Offer', 'Reward_per_Offer': 'Reward per Offer',
             'Tot_Tran_Amnt':'Total Transaction Amount',
             # Fixed typo: label previously read 'otal Transaction Count'.
             'Tot_Tran_Cnt':'Total Transaction Count',
             'Tran_Amnt_per_Offer':'Transactions Amount per Offer','offer_comp': 'Offer Completed'}
Temp = list(Target.values())

# One importance row per (feature, offer type), pulled from each per-label
# forest. Built as a list and concatenated once — DataFrame.append was removed
# in pandas 2.0 and grew the frame quadratically.
per_target_frames = []
for i, target_name in enumerate(Temp):
    frame = pd.DataFrame(data=clf.estimators_[i].feature_importances_,
                         index=X_train.columns, columns=['Importance'])
    frame['Target'] = target_name
    per_target_frames.append(frame)
Results = pd.concat(per_target_frames)

Results = Results.reset_index(drop=False).rename(columns={'index': 'Features'})

# Mean importance across the three targets, appended as an 'Overall' series.
# (String 'mean' instead of np.mean — the callable form is deprecated.)
Temp = pd.pivot_table(Results, values='Importance', index=['Features'],
                      aggfunc='mean', fill_value=0).reset_index(drop=False)
Temp['Target'] = 'Overall'
Results = pd.concat([Results, Temp])
del Temp
Results['Features'] = Results['Features'].replace(Feat_Dict)

# Grouped bar chart: one bar per target (plus 'Overall') for every feature.
Colors = ['LightBlue', 'DeepSkyBlue', 'CornFlowerBlue', 'OrangeRed']
fig = px.bar(Results, x='Features', y='Importance', orientation='v',
             color='Target', color_discrete_sequence=Colors, barmode='group', height=600)
# Single marker-outline style (the original called update_traces twice with
# different values; only the last call took effect, so the first was dead code).
fig.update_traces(marker_line_color='Navy', marker_line_width=1.2, opacity=1)
fig.update_xaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.update_yaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='Lightgray')
fig.update_layout(legend_orientation='h', plot_bgcolor='white',
                  legend_title_text=None, legend=dict(x=0, y=1.1, bordercolor="Black", borderwidth=1))
fig['layout']['yaxis'].update(range=[0, .3])
fig.update_layout(title={'text': 'Classification Feature Importance',
                         'x': 0.5, 'y': 1,
                         'xanchor': 'center', 'yanchor': 'top'})
fig.show()

Predictions

Our model can predict multiple offer types for a given customer and rank the recommendations in descending order of predicted conversion probability. This lets us find the most suitable offer for each customer.

In [9]:
def Best_Offer(ID, Model=clf):
    """Recommend offer types for the given users, best first.

    Parameters
    ----------
    ID : sequence of index labels
        Row labels into the global feature frame ``X`` (and ``Data``).
    Model : fitted MultiOutputClassifier, default the global ``clf``
        Classifier used for predictions. (Fix: the original accepted this
        parameter but always used the global ``clf``.)

    Returns
    -------
    pd.DataFrame with one row per user: the customer hash and either a list of
    recommended offer names (sorted by descending probability) or the string
    'No offer is recommended' when no label is predicted positive.
    """
    # All available offer names, e.g. 'BOGO Offers' -> 'BOGO'.
    Offers = [x.replace(' Offers', '') for x in Target.values()]
    # Per-class probability estimates and hard 0/1 predictions for the users.
    Predicted_Prop = Model.predict_proba(X.loc[ID])
    Class_Predicted_Prop = Model.predict(X.loc[ID])
    # P(label == 1) for each user (rows) and each offer type (columns).
    Offer_Predicted_Prop = np.array(
        [[Predicted_Prop[c][i][1] for c in range(len(Offers))]
         for i in range(len(ID))])

    Best_Offer_List = []
    for user in range(len(Offer_Predicted_Prop)):
        if Class_Predicted_Prop[user].sum() > 0:
            # Indices of offer types predicted positive for this user.
            Predicted_Class_ID = np.argwhere(Class_Predicted_Prop[user] == 1).flatten()
            # Offer indices sorted by descending probability.
            Prop_ID_Sort = np.argsort(-Offer_Predicted_Prop[user])
            # Keep only the positive predictions, in probability order.
            Best_Index = [i for i in Prop_ID_Sort if i in Predicted_Class_ID]
            Best_Offer_List.append([Offers[i] for i in Best_Index])
        else:
            Best_Offer_List.append('No offer is recommended')

    return pd.DataFrame(data={'Customers': Data['Person'].loc[ID].values,
                              'Most Suitable Offers': Best_Offer_List})


def Estimated_Probabilities(ID, Model=clf):
    """Return a per-user table of class-0/class-1 probabilities per offer type.

    Parameters
    ----------
    ID : sequence of index labels
        Row labels into the global feature frame ``X`` (and ``Data``).
    Model : fitted MultiOutputClassifier, default the global ``clf``
        Classifier used for predictions. (Fix: the original accepted this
        parameter but always used the global ``clf``.)

    Returns
    -------
    pd.DataFrame indexed by customer hash, with a two-level column header:
    '<offer type> Probability' over the class labels 0 and 1.
    """
    # Compute probabilities once and reuse them — the original re-ran
    # predict_proba for every offer type inside the loop.
    probas = Model.predict_proba(X.loc[ID])
    Prop = pd.concat([pd.DataFrame(probas[i]) for i in range(len(Target))],
                     axis=1)

    # Duplicate each offer-type label so it spans its 0/1 probability columns.
    Temp = []
    for x in [x + ' Probability' for x in Target.values()]:
        Temp.append(x)
        Temp.append(x)
    # len(Target) instead of the original hard-coded 3.
    header = [np.array(Temp, dtype=str),
              np.array([0, 1] * len(Target))]
    Prop = pd.DataFrame(Prop.values, columns=header)
    Prop.index = Data['Person'].loc[ID].values
    return Prop

For example consider a random list of ten customers. We have,

In [10]:
n = 10
# Showcase n distinct customers from the test set. replace=False prevents the
# same customer from appearing twice (np.random.choice samples WITH
# replacement by default), and the fixed seed makes the showcased rows
# reproducible across notebook re-runs.
rng = np.random.default_rng(42)
ID = rng.choice(X_test.index, n, replace=False)
display(Best_Offer(ID))

display(Estimated_Probabilities(ID))
Customers Most Suitable Offers
0 37e765f48ac840a4b6de70c953d6576f [Informational, Discount]
1 80277bc9a67b48a2868c740036d7cbd4 [Informational, BOGO]
2 f37d9566b54547d8bdc64cc949ddab90 [Discount, Informational, BOGO]
3 951fab8598b540cda53df4c2d43ad1d7 [BOGO, Discount]
4 95218cd01a884f8a8eab89747f0ef8da No offer is recommended
5 63fcd3b7b0e64166ac6b8d6cc4c5f23e [Informational, Discount]
6 e27d828d67894aa58a1e250ff5feb166 [Informational, BOGO]
7 1e211df9e98f4147a9626ab9d2da45a4 [BOGO, Discount]
8 3e193daf33dd4b809a90d90d0820ab0e [Discount, Informational, BOGO]
9 cc44cc35b0f044aaa9f174549fe86577 [Informational, Discount]
BOGO Offers Probability Discount Offers Probability Informational Offers Probability
0 1 0 1 0 1
37e765f48ac840a4b6de70c953d6576f 0.90 0.10 0.01 0.99 0.00 1.00
80277bc9a67b48a2868c740036d7cbd4 0.18 0.82 0.84 0.16 0.00 1.00
f37d9566b54547d8bdc64cc949ddab90 0.37 0.63 0.10 0.90 0.10 0.90
951fab8598b540cda53df4c2d43ad1d7 0.03 0.97 0.18 0.82 0.98 0.02
95218cd01a884f8a8eab89747f0ef8da 1.00 0.00 1.00 0.00 1.00 0.00
63fcd3b7b0e64166ac6b8d6cc4c5f23e 0.98 0.02 0.02 0.98 0.00 1.00
e27d828d67894aa58a1e250ff5feb166 0.15 0.85 0.91 0.09 0.10 0.90
1e211df9e98f4147a9626ab9d2da45a4 0.00 1.00 0.01 0.99 0.99 0.01
3e193daf33dd4b809a90d90d0820ab0e 0.33 0.67 0.00 1.00 0.09 0.91
cc44cc35b0f044aaa9f174549fe86577 1.00 0.00 0.02 0.98 0.00 1.00